The purpose of the case study is to classify a given silhouette as one of four different types of vehicle, using a set of features extracted from the silhouette. The vehicle may be viewed from one of many different angles. Four "Corgie" model vehicles were used for the experiment: a double-decker bus, a Chevrolet van, a Saab 9000 and an Opel Manta 400. This particular combination of vehicles was chosen with the expectation that the bus, van and either one of the cars would be readily distinguishable, but it would be more difficult to distinguish between the cars. Note: the task below treats the data as three vehicle types — presumably the two car models are grouped into a single "car" class in the supplied vehicle.csv (verify against the `class` column). The purpose is therefore to classify a given silhouette as one of three types of vehicle, using a set of features extracted from the silhouette, which may be viewed from one of many different angles.
Link to the case file: vehicle.csv
The points distribution for this case is as follows:
# Numerical libraries
import numpy as np
import warnings
warnings.filterwarnings('ignore')
# Import Linear Regression machine learning library
from sklearn.linear_model import LinearRegression
# to handle data in form of rows and columns
import pandas as pd
# importing ploting libraries
import matplotlib.pyplot as plt
#importing seaborn for statistical plots
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from scipy.spatial import distance
from scipy.cluster.hierarchy import cophenet, dendrogram, linkage
from scipy.spatial.distance import pdist #Pairwise distribution between data points
from scipy.cluster.hierarchy import fcluster
from sklearn import metrics
from sklearn.linear_model import LinearRegression
# Load the vehicle silhouette data and take a first look at its structure:
# sample rows, dimensions, dtypes/nulls, summary statistics, class balance.
vehicles_df = pd.read_csv("vehicle.csv")
vehicles_df.head(2)
vehicles_df.shape
vehicles_df.info()
vehicles_df.describe().T
# Distribution of the target variable.
vehicles_df['class'].value_counts()
# Bar chart of how many silhouettes fall into each vehicle class,
# with classes in alphabetical order along the x-axis.
plt.figure()
class_counts = vehicles_df['class'].value_counts().sort_index()
class_counts.plot(kind='bar')
plt.ylabel("Count")
plt.xlabel("Vehicle Type")
plt.title('Number of vehicles');
# Column-level sanity checks: distinct-value counts per column.
vehicles_df.nunique()
# Columns that contain at least one NaN (these will need imputation later).
vehicles_df.columns[vehicles_df.isna().any() == True]
vehicles_df['class'].value_counts()
# Visualise the distinct-value count of every column.
plt.figure(figsize=(15,8))
pd.Series(vehicles_df.nunique().sort_index()).plot(kind = 'bar')
plt.ylabel("Distinct value Count")
plt.xlabel("Column Name")
plt.title('Distinct value Count');
# Horizontal boxplots of every numeric feature to eyeball scale differences
# and outliers across the whole dataset.
plt.figure(figsize=(15,8))
sns.boxplot(data=vehicles_df, orient="h", palette="Set2")
vehicles_df.columns
# Per-class boxplots: one subplot per feature, laid out on a 6x3 grid.
from matplotlib import gridspec
fig, axs = plt.subplots(6, 3, figsize=(15, 25))
feature_names = list(vehicles_df.columns.values)[:-1]  # all columns except the 'class' label
for idx, feature in enumerate(feature_names):
    grid_row, grid_col = divmod(idx, 3)
    sns.boxplot(x='class', y=feature, data=vehicles_df, ax=axs[grid_row][grid_col])
# Column medians (reference values for the imputation step below) and a sample.
vehicles_df.median().values
vehicles_df.head(10)
# Impute missing feature values with each column's median.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
# Work on the features only; 'class' is the target and is kept aside.
vehicles_df_X = vehicles_df.drop('class', axis=1)
vehicles_df_X.sample()
imputer.fit(vehicles_df_X)
imputer.statistics_  # the learned per-column medians
X = imputer.transform(vehicles_df_X)
# BUG FIX: the original referenced the undefined name 'vehicles_df_imputed'
# here (NameError); rebuild the DataFrame from the imputed ndarray using
# vehicles_df_X's own columns and index.
vehicles_df_X = pd.DataFrame(X, columns=vehicles_df_X.columns,
                             index=vehicles_df_X.index)
vehicles_df_X.info()
#vehicles_df_outliers = vehicles_df_imputed[['radius_ratio','pr.axis_aspect_ratio','max.length_aspect_ratio',,
#'scaled_variance.1','scaled_radius_of_gyration.1']]
# Inter-quartile-range (IQR) fences: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
# are treated as outliers.
Q3 = vehicles_df_X.quantile(0.75)
Q1 = vehicles_df_X.quantile(0.25)
vehicles_df_outliers_iqr = Q3 - Q1
mini = Q1 - 1.5*vehicles_df_outliers_iqr
maxi = Q3 + 1.5*vehicles_df_outliers_iqr
vehicles_df_outliers_iqr
#### All the outliers lie on the right so we will consider maximum
# Row indices exceeding the upper fence for each of the four skewed features.
maximum_radius_ratio_ids = vehicles_df_X[vehicles_df_X['radius_ratio']>maxi['radius_ratio']]['radius_ratio'].index
maximum_axis_aspect_ratio_ids = vehicles_df_X[vehicles_df_X['pr.axis_aspect_ratio']>maxi['pr.axis_aspect_ratio']]['pr.axis_aspect_ratio'].index
# BUG FIX: the original used the undefined name 'vehicles_df_imputed' on this
# line (NameError); the working frame is vehicles_df_X.
maximum_length_aspect_ratio_ids = vehicles_df_X[vehicles_df_X['max.length_aspect_ratio']>maxi['max.length_aspect_ratio']]['max.length_aspect_ratio'].index
maximum_scaled_radius_of_gyration_1_ids = vehicles_df_X[vehicles_df_X['scaled_radius_of_gyration.1']>maxi['scaled_radius_of_gyration.1']]['scaled_radius_of_gyration.1'].index
# How many outlier rows per feature?
print(maximum_radius_ratio_ids.size)
print(maximum_axis_aspect_ratio_ids.size)
print(maximum_length_aspect_ratio_ids.size)
print(maximum_scaled_radius_of_gyration_1_ids.size)
# Which vehicle classes do the outlier rows belong to?
print(vehicles_df.loc[maximum_radius_ratio_ids]['class'].value_counts())
print(vehicles_df.loc[maximum_axis_aspect_ratio_ids]['class'].value_counts())
print(vehicles_df.loc[maximum_length_aspect_ratio_ids]['class'].value_counts())
print(vehicles_df.loc[maximum_scaled_radius_of_gyration_1_ids]['class'].value_counts())
# Cap the outliers: masking with `< maxi` turns values above the upper fence
# into NaN, which are then replaced by the column median.
# BUG FIX: the original indexed the undefined name 'vehicles_df_imputed'
# (NameError); mask vehicles_df_X instead.
vehicles_df_X_updated = vehicles_df_X[vehicles_df_X < maxi].apply(lambda x: x.fillna(x.median()),axis=0)
vehicles_df_X_updated.describe().T
# Re-attach the target so the pair plot can be coloured by class.
vehicles_df_X_updated['class'] = vehicles_df['class']
sns.pairplot(vehicles_df_X_updated, diag_kind='kde',hue='class')
#vehicles_df_updated = pd.get_dummies(vehicles_df_updated, columns=['class'])
# Encode the string class labels as integers for scikit-learn.
from sklearn.preprocessing import LabelEncoder
# instantiate labelencoder object
le = LabelEncoder()
# BUG FIX: the original used the undefined name 'vehicles_df_updated'
# (NameError); the working frame is vehicles_df_X_updated.
vehicles_df_X_updated['class'] = le.fit_transform(vehicles_df_X_updated['class'])
vehicles_df_X_updated['class'].value_counts()
list(le.classes_)  # the original label names, in encoded order
# Correlation structure of the cleaned features.
# BUG FIX: the original used the undefined name 'vehicles_df_updated'
# (NameError); use vehicles_df_X_updated.
vehicles_df_imputed_corr = vehicles_df_X_updated.corr()
vehicles_df_imputed_corr
plt.figure(figsize=(15,10))
sns.heatmap(vehicles_df_imputed_corr, annot=True)
# Highlight strongly correlated pairs (|r| > 0.70) — the motivation for PCA.
vehicles_df_imputed_corr[abs(vehicles_df_imputed_corr) > (0.70)]
# Separate features (X) from the encoded target (y).
X = vehicles_df_X_updated.drop(['class'],axis=1)
y = vehicles_df_X_updated['class']
# Standardise the features to zero mean / unit variance before PCA.
sc = StandardScaler()
X_std = sc.fit_transform(X)
# Covariance matrix of the standardised features.
cov_matrix = np.cov(X_std.T)
pd.DataFrame(cov_matrix, columns=X.columns, index=X.columns)
# Eigen-decomposition: each eigenvalue is the variance along its eigenvector.
eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)
pd.DataFrame(eigenvalues, columns=['Eigen Values']).T
pd.DataFrame(eigenvectors, index=X.columns)
# Proportion of total variance explained by each component, sorted in
# descending order, plus the running (cumulative) total.
variance = pd.DataFrame(eigenvalues, columns=['Variance of Eigen Vector']).T
variance = variance/eigenvalues.sum()
variance = variance.sort_values(axis=1, by='Variance of Eigen Vector', ascending=False)
variance.loc['Cumulative Variance of Eigen Vector'] = [n for n in variance.loc['Variance of Eigen Vector'].cumsum()]
variance
# Scree plot: per-component variance as bars with the cumulative-variance
# curve overlaid (used to pick how many components to keep).
plt.figure(figsize=(20,10))
sns.barplot(x = variance.columns, y = variance.loc['Variance of Eigen Vector'], order=list(variance.columns))
ax = sns.lineplot(x = variance.columns.sort_values(), y = variance.loc['Cumulative Variance of Eigen Vector'])
plt.grid(axis='both')
# Annotate each point on the curve with its cumulative-variance value.
[ax.text(p[0]+0.20, p[1], '{0:.4f}'.format(p[1]), color='g') for p in zip(ax.get_xticks(), variance.loc['Cumulative Variance of Eigen Vector'])]
# Fit PCA with 8 components and project the standardised data in one step
# (fit_transform is equivalent to fit followed by transform).
from sklearn.decomposition import PCA
pca = PCA(n_components=8)
X_pca = pca.fit_transform(X_std)
# Component loadings: contribution of each original feature to each PC.
pd.DataFrame(pca.components_, columns=X.columns)
# Variance captured per component — absolute and as a fraction of the total.
pca.explained_variance_
pca.explained_variance_ratio_
X_pca.shape
# DataFrame view of the projected data, pair-plotted to inspect structure.
Proj_data_df = pd.DataFrame(X_pca)
sns.pairplot(Proj_data_df, diag_kind='kde')
# To calculate the accuracy score of the model
from sklearn.metrics import accuracy_score, confusion_matrix
# Hold out 20% of the PCA-projected data for testing (fixed seed for
# reproducibility).
X_train, X_test, y_train, y_test = train_test_split(Proj_data_df,y, test_size = 0.2, random_state = 10)
X_train.head(2)
from sklearn.svm import SVC
# Building a Support Vector Machine on train data
# NOTE(review): C=0.1 / gamma=1 look like ad-hoc starting values; they are
# tuned properly via GridSearchCV further below.
svc_model_poly = SVC(C= .1, kernel='poly', gamma= 1)
svc_model_linear = SVC(C= .1, kernel='linear', gamma= 1)
def SVMModel(model, X_train, y_train, X_eval=None, y_eval=None):
    """Fit `model` on the training data and print its test-set performance.

    Parameters
    ----------
    model : estimator with fit/predict/score (e.g. sklearn SVC)
    X_train, y_train : training features and labels.
    X_eval, y_eval : evaluation features and labels. Default to the
        module-level X_test / y_test, preserving the original behaviour
        of the existing call sites.

    Prints the model, its accuracy on the evaluation set, and the
    confusion matrix of its predictions.
    """
    # BUG FIX: the original silently read the globals X_test/y_test while its
    # comment claimed it was reporting "accuracy on the training set"; the
    # evaluation data is now an explicit (optional) parameter.
    if X_eval is None:
        X_eval, y_eval = X_test, y_test
    print(model)
    model.fit(X_train, y_train)
    prediction = model.predict(X_eval)
    # Accuracy on the held-out evaluation set.
    print(model.score(X_eval, y_eval))
    print("Confusion Matrix:\n",confusion_matrix(prediction,y_eval))
SVMModel(svc_model_linear,X_train, y_train)
SVMModel(svc_model_poly,X_train, y_train)
#importing modules
from sklearn.model_selection import GridSearchCV
from sklearn import svm
#making the instance
svc_model=svm.SVC()
#Hyper Parameters Set
params = {'C': [0.01, 0.05, 0.5, 1],
'kernel': ['linear','rbf','poly']}
#Making models with hyper parameters sets
# 10-fold cross-validated grid search over C and kernel; n_jobs=-1 uses all
# available CPU cores.
model_gridSearchCV = GridSearchCV(svc_model, param_grid=params, n_jobs=-1, cv=10)
#Learning
model_gridSearchCV.fit(X_train,y_train)
#The best hyper parameters set
print("Best Hyper Parameters:\n",model_gridSearchCV.best_params_)
#Prediction
prediction2=model_gridSearchCV.predict(X_test)
#importing the metrics module
from sklearn import metrics
#evaluation(Accuracy)
print("Accuracy:",metrics.accuracy_score(prediction2,y_test))
#evaluation(Confusion Metrix)
print("Confusion Metrix:\n",metrics.confusion_matrix(prediction2, y_test))
# TODO: also evaluate with (Stratified)KFold cross-validation